Calculate exact null distribution #85
Conversation
The last bottleneck I've found upon increasing the data size is building the ranked lists. It is a much less important bottleneck, but the optimisations seem simple enough. The snippet below shows that a duckdb-based method matches what the current implementation produces:

```python
import duckdb
import numpy as np

from copairs.map.average_precision import build_rank_lists


def build_rank_duckdb(pos_pairs, neg_pairs, pos_sims, neg_sims):
    pos_pairs, neg_pairs, pos_sims, neg_sims = pos_pairs.T, neg_pairs.T, pos_sims.T, neg_sims.T
    with duckdb.connect(":memory:"):
        # Combine relevance labels: 1 for positive pairs and 0 for negative pairs
        query = (
            "SELECT *,{val} AS label"
            " FROM {var}_pairs"
            " POSITIONAL JOIN (SELECT #1 AS sim FROM {var}_sims)"
        )
        pos_table = duckdb.sql(query.format(var="pos", val=1))
        neg_table = duckdb.sql(query.format(var="neg", val=0))
        joint = pos_table.union(neg_table)
        # Pivot to have all indices in one column
        pivot_query = (
            "UNPIVOT joint"
            " ON column0,column1"
            " INTO NAME col VALUE ix"
        )
        pivoted = duckdb.sql(pivot_query)
        # Sort by index, breaking ties by similarity in descending order
        sort_query = (
            "SELECT label"
            " FROM pivoted"
            " ORDER BY ix ASC,"
            " sim DESC"
        )
        rel_k = duckdb.sql(sort_query)
        # Count how many pairs each index participates in
        count_query = (
            "SELECT ix,COUNT(ix) AS counts"
            " FROM pivoted"
            " GROUP BY ix"
            " ORDER BY ix ASC"
        )
        counted = duckdb.sql(count_query).fetchnumpy()
        return counted["ix"], rel_k.fetchnumpy()["label"], counted["counts"].astype(np.uint32)


pos_pairs = np.array([[1, 2], [2, 3], [4, 5]])
pos_sims = np.array([0.99, 0.5, 0.2])
neg_pairs = pos_pairs + 1
neg_sims = 1 - pos_sims

print("Original method, results:")
print("ix: {}\nrel: {}\ncounts: {}".format(*build_rank_lists(pos_pairs, neg_pairs, pos_sims, neg_sims)))
print("New method, results:")
print("ix: {}\nrel: {}\ncounts: {}".format(*build_rank_duckdb(pos_pairs, neg_pairs, pos_sims, neg_sims)))

"""
Original method, results:
ix: [1 2 3 4 5 6]
rel: [1 1 1 0 1 0 0 0 1 0 1 0]
counts: [1 3 3 2 2 1]
New method, results:
ix: [1 2 3 4 5 6]
rel: [1 1 1 0 0 1 0 0 1 0 1 0]
counts: [1 3 3 2 2 1]
""" |
The current implementation scales better than a duckdb one. I presume it is due to the number of operations and re-sortings that we need to do in duckdb. So I will leave build_rank_lists untouched, but at least we know now.

```python
from itertools import product
from timeit import Timer

setup = (
    """
import duckdb
import numpy as np
from copairs.map.average_precision import build_rank_lists

pos_pairs = np.random.randint({max_n}, size=({npos},2))
neg_pairs = np.random.randint({max_n}, size=({nneg},2))
pos_sims = np.random.random({npos})
neg_sims = np.random.random({nneg})


def build_rank_duckdb(pos_pairs, neg_pairs, pos_sims, neg_sims):
    pos_pairs, neg_pairs, pos_sims, neg_sims = pos_pairs.T, neg_pairs.T, pos_sims.T, neg_sims.T
    with duckdb.connect(":memory:"):
        # Combine relevance labels: 1 for positive pairs and 0 for negative pairs
        query = (
            "SELECT *,{{val}} AS label"
            " FROM {{var}}_pairs"
            " POSITIONAL JOIN (SELECT #1 AS sim FROM {{var}}_sims)"
        )
        pos_table = duckdb.sql(query.format(var="pos", val=1))
        neg_table = duckdb.sql(query.format(var="neg", val=0))
        joint = pos_table.union(neg_table)
        # Pivot to have all indices in one column
        pivot_query = (
            "UNPIVOT joint"
            " ON column0,column1"
            " INTO NAME col VALUE ix"
        )
        pivoted = duckdb.sql(pivot_query)
        # Sort by index, breaking ties by similarity in descending order
        sort_query = (
            "SELECT label"
            " FROM pivoted"
            " ORDER BY ix ASC,"
            " sim DESC"
        )
        rel_k = duckdb.sql(sort_query)
        # Count how many pairs each index participates in
        count_query = (
            "SELECT ix,COUNT(ix) AS counts"
            " FROM pivoted"
            " GROUP BY ix"
            " ORDER BY ix ASC"
        )
        counted = duckdb.sql(count_query).fetchnumpy()
        return counted["ix"], rel_k.fetchnumpy()["label"], counted["counts"].astype(np.uint32)
"""
)

max_n = [int(10**i) for i in range(2, 8)]
npairs = [int(10**i) for i in range(3, 10)]
n_times = 3
n_repeats = 5

for npos, nneg, max_n in product(npairs, npairs, max_n):
    for fn in (
        "build_rank_duckdb(pos_pairs, neg_pairs, pos_sims, neg_sims)",
        "build_rank_lists(pos_pairs, neg_pairs, pos_sims, neg_sims)",
    ):
        times = Timer(fn, setup=setup.format(npos=npos, nneg=nneg, max_n=max_n)).repeat(n_times, n_repeats)
        avg = sum(times) / n_repeats
        print(f"{fn.split('(')[0]},{npos=},{nneg=},{max_n} has an avg time of {avg:0.3f} secs.")

"""
build_rank_duckdb,npos=1000,nneg=1000,100 has an avg time of 0.069 secs.
build_rank_lists,npos=1000,nneg=1000,100 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,1000 has an avg time of 0.068 secs.
build_rank_lists,npos=1000,nneg=1000,1000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,10000 has an avg time of 0.071 secs.
build_rank_lists,npos=1000,nneg=1000,10000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,100000 has an avg time of 0.070 secs.
build_rank_lists,npos=1000,nneg=1000,100000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,1000000 has an avg time of 0.070 secs.
build_rank_lists,npos=1000,nneg=1000,1000000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=1000,10000000 has an avg time of 0.070 secs.
build_rank_lists,npos=1000,nneg=1000,10000000 has an avg time of 0.001 secs.
build_rank_duckdb,npos=1000,nneg=10000,100 has an avg time of 0.114 secs.
build_rank_lists,npos=1000,nneg=10000,100 has an avg time of 0.006 secs.
build_rank_duckdb,npos=1000,nneg=10000,1000 has an avg time of 0.113 secs.
build_rank_lists,npos=1000,nneg=10000,1000 has an avg time of 0.007 secs.
build_rank_duckdb,npos=1000,nneg=10000,10000 has an avg time of 0.115 secs.
build_rank_lists,npos=1000,nneg=10000,10000 has an avg time of 0.008 secs.
build_rank_duckdb,npos=1000,nneg=10000,100000 has an avg time of 0.119 secs.
build_rank_lists,npos=1000,nneg=10000,100000 has an avg time of 0.007 secs.
build_rank_duckdb,npos=1000,nneg=10000,1000000 has an avg time of 0.119 secs.
build_rank_lists,npos=1000,nneg=10000,1000000 has an avg time of 0.007 secs.
build_rank_duckdb,npos=1000,nneg=10000,10000000 has an avg time of 0.119 secs.
build_rank_lists,npos=1000,nneg=10000,10000000 has an avg time of 0.007 secs.
build_rank_duckdb,npos=1000,nneg=100000,100 has an avg time of 0.555 secs.
build_rank_lists,npos=1000,nneg=100000,100 has an avg time of 0.069 secs.
build_rank_duckdb,npos=1000,nneg=100000,1000 has an avg time of 0.550 secs.
build_rank_lists,npos=1000,nneg=100000,1000 has an avg time of 0.081 secs.
build_rank_duckdb,npos=1000,nneg=100000,10000 has an avg time of 0.569 secs.
build_rank_lists,npos=1000,nneg=100000,10000 has an avg time of 0.094 secs.
build_rank_duckdb,npos=1000,nneg=100000,100000 has an avg time of 0.591 secs.
build_rank_lists,npos=1000,nneg=100000,100000 has an avg time of 0.098 secs.
build_rank_duckdb,npos=1000,nneg=100000,1000000 has an avg time of 0.614 secs.
build_rank_lists,npos=1000,nneg=100000,1000000 has an avg time of 0.103 secs.
build_rank_duckdb,npos=1000,nneg=100000,10000000 has an avg time of 0.621 secs.
build_rank_lists,npos=1000,nneg=100000,10000000 has an avg time of 0.100 secs.
build_rank_duckdb,npos=1000,nneg=1000000,100 has an avg time of 4.839 secs.
build_rank_lists,npos=1000,nneg=1000000,100 has an avg time of 0.813 secs.
build_rank_duckdb,npos=1000,nneg=1000000,1000 has an avg time of 4.894 secs.
""" |
```diff
@@ -531,7 +532,7 @@ def get_null_dists(
     # Function to generate null distributions for each configuration
     def par_func(i):
         num_pos, total = confs[i]
-        null_dists[i] = null_dist_cached(num_pos, total, seeds[i], null_size, cache_dir)
+        null_dists[i] = get_random_ap(total, num_pos)
```
I'm a bit confused by this change: we need to get a whole distribution here, while get_random_ap returns a single score.
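To make the distinction concrete, here is a minimal, self-contained sketch (not copairs' code; `average_precision` and `sample_null_ap_dist` are hypothetical helpers) of the difference between a null distribution of APs and a single expected AP:

```python
import numpy as np


def average_precision(rel_k: np.ndarray) -> float:
    """AP of a binary relevance vector already ordered by decreasing similarity."""
    hits = np.cumsum(rel_k)
    ranks = np.arange(1, len(rel_k) + 1)
    return (hits / ranks)[rel_k.astype(bool)].mean()


def sample_null_ap_dist(num_pos: int, total: int, null_size: int, seed: int = 0) -> np.ndarray:
    """Null *distribution*: APs of null_size random orderings of num_pos positives among total items."""
    rng = np.random.default_rng(seed)
    rel = np.zeros(total, dtype=np.uint8)
    rel[:num_pos] = 1
    return np.array([average_precision(rng.permutation(rel)) for _ in range(null_size)])


null_dist = sample_null_ap_dist(num_pos=5, total=100, null_size=10_000)  # an array of scores
expected_ap = null_dist.mean()  # a single number, akin to what a function returning one expected AP gives
```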
My bad, it already calculates the exact expected random average precision for M choose n. In that case the p-value should probably be calculated in a different way, not as "the proportion of null scores >= observed score".
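For context, "the proportion of null scores >= observed score" refers to the usual empirical p-value computed from such a null distribution. A minimal sketch follows; the +1 smoothing is a common convention, not necessarily copairs' exact formula:

```python
import numpy as np


def empirical_pvalue(observed_ap: float, null_dist: np.ndarray) -> float:
    # Proportion of null scores at least as large as the observed score,
    # smoothed by +1 so a finite null sample never yields a p-value of exactly zero.
    return (np.sum(null_dist >= observed_ap) + 1) / (len(null_dist) + 1)
```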
After a chat with @alxndrkalinin, it turns out getting the expected AP is not that useful. Will close the issue in favour of the current implementation.
I was going to give up after #84, but it actually performs very well on real datasets over at the map repo.